#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: plot the national rates of unionization in the coal industry

#Load packages
library(tidyverse)
library(tidylog)
library(here)
library(readxl)

# https://www.unionstats.com
# V. Industry: Union Membership, Coverage, Density, and Employment by Industry, 1983-2021

#coal mining unionization
##create file list
# One workbook URL per survey year, 1983 through 2022 inclusive. The three
# intermediate vectors are kept so each group remains available by name.
url_prefix <- "https://www.unionstats.com/ind/xls/ind_"
files20c <- paste0(url_prefix, 1983:1999, ".xlsx")
# 2000:2009 so that the year 2000 is part of the series
files21c <- paste0(url_prefix, 2000:2009, ".xlsx")
files21cb <- paste0(url_prefix, 2010:2022, ".xlsx")
files_all <- c(files20c, files21c, files21cb)
#download files
# Download one yearly workbook and read it into a data frame.
#   x: URL of a workbook whose name ends in "<4-digit year>.xlsx".
# Returns the sheet as read by readxl::read_xlsx() with the first two rows
# skipped, plus a numeric `year` column taken from the URL.
get_union <- function(x) {
  # the four characters immediately before ".xlsx" are the year
  year <- as.numeric(stringr::str_sub(x, -9, -6))
  temp <- tempfile(fileext = ".xlsx")
  # remove the downloaded file when the function exits, even on error
  on.exit(unlink(temp), add = TRUE)
  # xlsx is a binary format; text-mode transfer can corrupt it on Windows
  download.file(x, temp, mode = "wb")
  out <- readxl::read_xlsx(temp, skip = 2)
  out$year <- year
  out
}
# Fetch every yearly workbook; the result is a list with one table per URL.
union_dat <- purrr::map(files_all, get_union)
# Replace the column names of one downloaded table with the nine names used
# throughout this script. Names are assigned by position.
rename_union <- function(x) {
  union_cols <- c(
    "cic", "industry", "obs", "employment", "members",
    "covered", "members_per", "coverage_per", "year"
  )
  stats::setNames(x, union_cols)
}
# Apply the common column names to each yearly table
union_dat_rename <- lapply(
  union_dat,
  function(tbl) rename_union(tbl)
)
# Stack all years into a single table and cache it on disk
union_df <- do.call(rbind, union_dat_rename)
saveRDS(
  union_df,
  file = here("data", "inter", "unionstats.rds")
)
#subset to coal
# Keep only rows whose industry label is one of the two coal-mining spellings
coal <- union_df[
  union_df$industry %in% c("Coal mining", "Coal mining (12)"),
]
#create plot
# Line chart of members_per by year for the coal-mining rows.
plot.union <- coal %>%
  # coerce the measure columns (obs through coverage_per) to numeric
  mutate(across(obs:coverage_per, as.numeric)) %>%
  ggplot() +
  # linewidth (rather than size) controls line thickness as of ggplot2 3.4.0
  geom_line(aes(x = year, y = members_per), linewidth = 1) +
  labs(x = "Year", y = "Percent Unionized") +
  # assumes members_per is stored as a proportion in [0, 1] -- TODO confirm
  scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
  scale_x_continuous(breaks = seq(1985, 2022, 5), expand = c(0, 0)) +
  theme_bw(base_size = 14) +
  theme(panel.grid = element_blank())
#Figure H4
# Write the unionization plot to output/figures as a PNG.
ggsave(
  filename = here("output", "figures", "si_fig_H4_unions.png"),
  plot = plot.union,
  width = .6 * 7.5,
  height = 2.5,
  scale = 1.5,
  dpi = 300
)
